

# to process data
library(tidyverse)
library(lubridate)
library(gdata)



# load workspaces with topic proportions, dictionary results
load("workspace_topic_models.Rdata")
load("workspace_dictionary_methods.Rdata")

# to use up less space, just keep the dataframe with topic proportions
keep(binding,results_binding,results_binding_dictionary, sure = T)


# load covariates (to merge with topic proportions)
load("covariates.RData")


####### TOPIC PROPORTIONS
# calculate average topic proportions for each agreement
results_binding2 <- results_binding %>%
  group_by(iso3c, id) %>%
  mutate(across(topic1_natres, list(full = ~ mean(.)))) %>%
  ungroup %>%
  select(iso3c,begin,end,duration,topic1_natres_full) %>%
  unique()

# now stretch them to every country-year under agreement
results_binding3 <- results_binding2 %>%
  select(iso3c, begin, end) %>% 
  mutate(id = row_number()) %>% 
  pivot_longer(cols      = begin:end,
               names_to  = "start_end",
               values_to = "created") %>% 
  select(-"start_end") %>% 
  group_by(iso3c, id) %>% 
  mutate(created = as.Date(created)) %>%
  complete(created = seq.Date(min(created),max(created), by = "year")) %>% 
  separate(created, into = c("year", "month", "day"), sep = "-", remove = FALSE) %>% 
  mutate(year  = as.numeric(year),
         month = as.numeric(month),
         day   = as.numeric(day),
         begin = min(created),
         end   = max(created)) %>% 
  ungroup() %>% 
  select(-id) %>% 
  left_join(results_binding2, by = c("iso3c", "begin", "end")) %>%
  filter(duration>1 & year>1978) %>%
  unique()

# check for country-year duplicates (after all, countries might be under multiple agreements in one single year)
test <- results_binding3 %>%
  group_by(iso3c,year) %>%
  filter(n()>1) %>%
  ungroup() #  402 duplicates

# we only want to use the agreement (and the corresponding conditions) that covered most days of the year
# we don't care about an agreement that covered 10 days of the year as much as we care about an agreement that covered 10 months, for example
test <- test %>%
  mutate(begin_year = year==year(begin),
         end_year = year==year(end),
         first_day = as.Date(ISOdate(test$year, 1, 1)),
         last_day = as.Date(ISOdate(test$year, 12, 31)),
         begin_last_day = ifelse(begin_year==TRUE, day(as.period(begin %--% last_day, unit="day")), NA),
         end_first_day = ifelse(end_year==TRUE, day(as.period(first_day %--% end, unit="day")), NA)) %>%
  group_by(iso3c,year) %>%
  mutate(keep = ifelse(!is.na(begin_last_day), begin_last_day,end_first_day),
         keep = ifelse(is.na(keep), 365, keep)) %>%
  slice(which.max(keep)) %>%
  ungroup() %>% 
  select(-c(begin_year,end_year,first_day,last_day,begin_last_day,end_first_day,keep))

results_binding4 <- results_binding3 %>%
  group_by(iso3c,year) %>%
  filter(n()<2) %>%
  ungroup()

# now bring it all together
results_binding4 <- rbind(results_binding4,test) %>%
  select(iso3c,year,topic1_natres_full)

results_binding4 %>%
  group_by(iso3c,year) %>%
  filter(n()>1) # no country-year duplicates now!




####### DICTIONARY ANALYSIS - ABSOLUTE
# calculate total number of absolute mentions for each agreement
results_binding_dictionary2 <- results_binding_dictionary %>%
  group_by(id) %>%
  mutate(across(resources_abs, list(total = ~ sum(.)))) %>%
  ungroup %>%
  select(iso3c,begin,end,duration,resources_abs_total) %>%
  unique()


# now stretch them to every country-year under agreement
results_binding_dictionary3 <- results_binding_dictionary2 %>%
  select(iso3c, begin, end) %>% 
  mutate(id = row_number()) %>% 
  pivot_longer(cols      = begin:end,
               names_to  = "start_end",
               values_to = "created") %>% 
  select(-"start_end") %>% 
  group_by(iso3c, id) %>% 
  mutate(created = as.Date(created)) %>%
  complete(created = seq.Date(min(created),max(created), by = "year")) %>% 
  separate(created, into = c("year", "month", "day"), sep = "-", remove = FALSE) %>% 
  mutate(year  = as.numeric(year),
         month = as.numeric(month),
         day   = as.numeric(day),
         begin = min(created),
         end   = max(created)) %>% 
  ungroup() %>% 
  select(-id) %>% 
  left_join(results_binding_dictionary2, by = c("iso3c", "begin", "end")) %>%
  filter(duration>1 & year>1978) %>%
  unique()

# check for country-year duplicates (after all, countries might be under multiple agreements in one single year)
test <- results_binding_dictionary3 %>%
  group_by(iso3c,year) %>%
  filter(n()>1) %>%
  ungroup() #  402 duplicates

# we only want to use the agreement (and the corresponding conditions) that covered most days of the year
# we don't care about an agreement that covered 10 days of the year as much as we care about an agreement that covered 10 months, for example
test <- test %>%
  mutate(begin_year = year==year(begin),
         end_year = year==year(end),
         first_day = as.Date(ISOdate(test$year, 1, 1)),
         last_day = as.Date(ISOdate(test$year, 12, 31)),
         begin_last_day = ifelse(begin_year==TRUE, day(as.period(begin %--% last_day, unit="day")), NA),
         end_first_day = ifelse(end_year==TRUE, day(as.period(first_day %--% end, unit="day")), NA)) %>%
  group_by(iso3c,year) %>%
  mutate(keep = ifelse(!is.na(begin_last_day), begin_last_day,end_first_day),
         keep = ifelse(is.na(keep), 365, keep)) %>%
  slice(which.max(keep)) %>%
  ungroup() %>% 
  select(-c(begin_year,end_year,first_day,last_day,begin_last_day,end_first_day,keep))

results_binding_dictionary4 <- results_binding_dictionary3 %>%
  group_by(iso3c,year) %>%
  filter(n()<2) %>%
  ungroup()

# now bring it all together
results_binding_dictionary4 <- rbind(results_binding_dictionary4,test) %>%
  select(iso3c,year,resources_abs_total)

results_binding_dictionary4 %>%
  group_by(iso3c,year) %>%
  filter(n()>1) # no country-year duplicates now!



####### DICTIONARY ANALYSIS - RELATIVE (TF-IDF)
# calculate average tf-idf for each agreement
results_binding_dictionary_tfidf2 <- results_binding_dictionary %>%
  group_by(id) %>%
  mutate(across(resources_tfidf, list(total = ~ mean(.)))) %>%
  ungroup %>%
  select(iso3c,begin,end,duration,resources_tfidf_total) %>%
  unique()


# now stretch them to every country-year under agreement
results_binding_dictionary_tfidf3 <- results_binding_dictionary_tfidf2 %>%
  select(iso3c, begin, end) %>% 
  mutate(id = row_number()) %>% 
  pivot_longer(cols      = begin:end,
               names_to  = "start_end",
               values_to = "created") %>% 
  select(-"start_end") %>% 
  group_by(iso3c, id) %>% 
  mutate(created = as.Date(created)) %>%
  complete(created = seq.Date(min(created),max(created), by = "year")) %>% 
  separate(created, into = c("year", "month", "day"), sep = "-", remove = FALSE) %>% 
  mutate(year  = as.numeric(year),
         month = as.numeric(month),
         day   = as.numeric(day),
         begin = min(created),
         end   = max(created)) %>% 
  ungroup() %>% 
  select(-id) %>% 
  left_join(results_binding_dictionary_tfidf2, by = c("iso3c", "begin", "end")) %>%
  filter(duration>1 & year>1978) %>%
  unique()

# check for country-year duplicates (after all, countries might be under multiple agreements in one single year)
test <- results_binding_dictionary_tfidf3 %>%
  group_by(iso3c,year) %>%
  filter(n()>1) %>%
  ungroup() #  402 duplicates

# we only want to use the agreement (and the corresponding conditions) that covered most days of the year
# we don't care about an agreement that covered 10 days of the year as much as we care about an agreement that covered 10 months, for example
test <- test %>%
  mutate(begin_year = year==year(begin),
         end_year = year==year(end),
         first_day = as.Date(ISOdate(test$year, 1, 1)),
         last_day = as.Date(ISOdate(test$year, 12, 31)),
         begin_last_day = ifelse(begin_year==TRUE, day(as.period(begin %--% last_day, unit="day")), NA),
         end_first_day = ifelse(end_year==TRUE, day(as.period(first_day %--% end, unit="day")), NA)) %>%
  group_by(iso3c,year) %>%
  mutate(keep = ifelse(!is.na(begin_last_day), begin_last_day,end_first_day),
         keep = ifelse(is.na(keep), 365, keep)) %>%
  slice(which.max(keep)) %>%
  ungroup() %>% 
  select(-c(begin_year,end_year,first_day,last_day,begin_last_day,end_first_day,keep))

results_binding_dictionary_tfidf4 <- results_binding_dictionary_tfidf3 %>%
  group_by(iso3c,year) %>%
  filter(n()<2) %>%
  ungroup()

# now bring it all together
results_binding_dictionary_tfidf4 <- rbind(results_binding_dictionary_tfidf4,test) %>%
  select(iso3c,year,resources_tfidf_total)

results_binding_dictionary_tfidf4 %>%
  group_by(iso3c,year) %>%
  filter(n()>1) # no country-year duplicates now!


####### CALCULATE COUNT OF BINDING CONDITIONS
binding_count <- binding %>%
  select(iso3c,begin,end,duration,nat_resource_issue,binding) %>%
  group_by(iso3c,begin,end) %>%
  mutate(nat_resource_issue = sum(nat_resource_issue),
         conditions_per_agreement = sum(binding),
         nat_resource_issue_rel = nat_resource_issue/conditions_per_agreement) %>%
  ungroup() %>%
  unique() %>%
  select(-binding)


# now stretch this to every country-year under agreement
binding_count2 <- binding_count %>%
  select(iso3c, begin, end) %>% 
  mutate(id = row_number()) %>% 
  pivot_longer(cols      = begin:end,
               names_to  = "start_end",
               values_to = "created") %>% 
  select(-"start_end") %>% 
  group_by(iso3c, id) %>% 
  mutate(created = as.Date(created)) %>%
  complete(created = seq.Date(min(created),max(created), by = "year")) %>% 
  separate(created, into = c("year", "month", "day"), sep = "-", remove = FALSE) %>% 
  mutate(year  = as.numeric(year),
         month = as.numeric(month),
         day   = as.numeric(day),
         begin = min(created),
         end   = max(created)) %>% 
  ungroup() %>% 
  select(-id) %>% 
  left_join(binding_count, by = c("iso3c", "begin", "end")) %>%
  filter(duration>1 & year>1978) %>%
  unique()

# check for country-year duplicates (after all, countries might be under multiple agreements in one single year)
test <- binding_count2 %>%
  group_by(iso3c,year) %>%
  filter(n()>1) %>%
  ungroup() #  402 duplicates

# we only want to use the agreement (and the corresponding conditions) that covered most days of the year
# we don't care about an agreement that covered 10 days of the year as much as we care about an agreement that covered 10 months, for example
test <- test %>%
  mutate(begin_year = year==year(begin),
         end_year = year==year(end),
         first_day = as.Date(ISOdate(test$year, 1, 1)),
         last_day = as.Date(ISOdate(test$year, 12, 31)),
         begin_last_day = ifelse(begin_year==TRUE, day(as.period(begin %--% last_day, unit="day")), NA),
         end_first_day = ifelse(end_year==TRUE, day(as.period(first_day %--% end, unit="day")), NA)) %>%
  group_by(iso3c,year) %>%
  mutate(keep = ifelse(!is.na(begin_last_day), begin_last_day,end_first_day),
         keep = ifelse(is.na(keep), 365, keep)) %>%
  slice(which.max(keep)) %>%
  ungroup() %>% 
  select(-c(begin_year,end_year,first_day,last_day,begin_last_day,end_first_day,keep))

binding_count3 <- binding_count2 %>%
  group_by(iso3c,year) %>%
  filter(n()<2) %>%
  ungroup()

# now bring it all together
binding_count3 <- rbind(binding_count3,test) %>%
  select(iso3c,year,nat_resource_issue,nat_resource_issue_rel) %>%
  mutate(resource_conditionality_dummy = ifelse(nat_resource_issue > 0, 1, 0))

binding_count3 %>%
  group_by(iso3c,year) %>%
  filter(n()>1) # no country-year duplicates now!


####### MERGE
full_df <- results_binding4 %>%
  full_join(covariates) %>%
  full_join(results_binding_dictionary4) %>%
  full_join(results_binding_dictionary_tfidf4) %>%
  full_join(binding_count3) %>%
  group_by(iso3c,country) %>%
  arrange(year, .by_group = T) %>%
  ungroup() %>%
  filter(year<2020) %>%
  select(country,iso3c,region,income,year,stabilization_law:pension_law,imf_program,topic1_natres_full,
         resources_abs_total,resources_tfidf_total,nat_resource_issue,nat_resource_issue_rel,resource_conditionality_dummy,
         imf_program_new:nationalization_oc) %>%
  mutate_at(vars(imf_program:imf_program_last_year), ~replace_na(., 0)) %>%
  rename(resource_conditionality = topic1_natres_full,
         resource_conditionality_dict_abs = resources_abs_total,
         resource_conditionality_dict_tfidf = resources_tfidf_total,
         resource_conditionality_count_abs = nat_resource_issue,
         resource_conditionality_count_rel = nat_resource_issue_rel)

# generate a time trend
time_trend <- data.frame(year = 1979:2019, t = 1:41)

# generate short-term vs long-term variables
full_df <- full_df %>%
  mutate(short_term = ifelse(stabilization_law==1 | investment_law==1 | development_law==1, 1, 0),
         long_term = ifelse(savings_law==1 | pension_law==1, 1, 0),
         any_fund = ifelse(short_term==1 | long_term==1, 1, 0),
         iso3c = as.factor(iso3c)) %>%
  group_by(iso3c) %>%
  arrange(year, .by_group = T) %>%
  mutate(across(short_term:any_fund, list(lead = ~ lead(.,1))),
         across(resource_conditionality, list(lead = ~ lead(.,1))),
         cumulative_short_term = cumsum(lag(short_term, n = 1, default = 0)),
         previous_short_term  = ifelse(cumulative_short_term>0, 1, 0),
         cumulative_long_term = cumsum(lag(long_term, n = 1, default = 0)),
         previous_long_term  = ifelse(cumulative_long_term>0, 1, 0)) %>%
  ungroup() %>%
  mutate(log_gdp_pc_constant = log(gdp_pc_constant),
         iso3c = as.factor(iso3c),
         any_war = ifelse(independence_war_mepv==1 | international_war_mepv==1 | civil_war_mepv==1 | ethnic_var_mepv==1, 1, 0),
         any_war = ifelse(is.na(any_war), 0, any_war),
         wb_extractive_project = ifelse(is.na(wb_extractive_project), 0, 1),
         left_executive = as.numeric(left_executive),
         election = as.numeric(election),
         nationalization_oc = as.numeric(nationalization_oc),
         log_external_debt_stocks = log(external_debt_stocks),
         field_discovery = ifelse(is.na(field_discovery), 0, field_discovery)) %>%
  left_join(time_trend) %>% # add time trend
  rename(ethnic_war_mepv = ethnic_var_mepv) 


save(full_df, file = "full_data_conditionality_covariates.RData")  


#### SUBSET DATA FOR MAIN ANALYSIS
glm_df <- full_df %>%
  select(country,iso3c,year,short_term,long_term,previous_short_term,previous_long_term,
         imf_program,resource_conditionality,resource_conditionality_dict_tfidf,
         resource_conditionality_count_abs,resource_conditionality_count_rel,resource_conditionality_dummy,
         resource_rents,log_gdp_pc_constant,gdp_growth,field_discovery,
         oil_price,crisis,polity2,left_executive,election,any_war,ideal_point_usa,eiti_member,
         nationalization_oc,wb_extractive_project,working_age_pop,log_external_debt_stocks,t) %>%
  mutate(resource_conditionality = resource_conditionality * 100,
         resource_conditionality_count_rel = resource_conditionality_count_rel * 100) %>%
  group_by(iso3c) %>%
  arrange(year, .by_group = T) %>%
  mutate(across(imf_program:log_external_debt_stocks, list(lag = ~ lag(.,1)))) %>%
  ungroup() %>%
  filter(year>1979)


save(glm_df, file = "full_data_glm.RData")

iv_df <- full_df %>%
  select(country,iso3c,year,short_term,long_term,previous_short_term,previous_long_term,
         imf_program,resource_conditionality,resource_conditionality_dict_tfidf,
         imf_liquidity_ratio,disbursement,unsc,imf_staff_percent,
         resource_rents,log_gdp_pc_constant,gdp_growth,field_discovery,
         oil_price,crisis,polity2,left_executive,election,any_war,ideal_point_usa,eiti_member,
         nationalization_oc,wb_extractive_project,working_age_pop,t) %>%
  mutate(resource_conditionality = resource_conditionality * 100) %>%
  group_by(iso3c) %>%
  arrange(year, .by_group = T) %>%
  mutate(across(imf_program:working_age_pop, list(lag = ~ lag(.,1)))) %>%
  ungroup() %>%
  filter(year>1979)

save(iv_df, file = "full_data_iv.RData")
